import json
import time

import requests
from bs4 import BeautifulSoup


# Download the HTML of all listing pages

def download_pages():
    htmls = []

    for idx in range(5):
        url = f"http://www.crazyant.net/page/{idx+1}"

        response = requests.get(url, timeout=10)  # fail fast instead of hanging forever

        if response.status_code == 200:
            print(f"Crawling page {idx + 1}...")
            time.sleep(1)
        else:
            raise Exception("Request failed")
        htmls.append(response.text)
    return htmls
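

# A minimal sketch of a more resilient download loop, assuming the same
# listing URLs as above. Reusing one requests.Session avoids re-opening a
# TCP connection per page, and raise_for_status() turns HTTP 4xx/5xx
# responses into exceptions. The function name and the num_pages parameter
# are illustrative, not part of the original script.
def download_pages_with_session(num_pages=5):
    session = requests.Session()
    htmls = []
    for page in range(1, num_pages + 1):
        response = session.get(f"http://www.crazyant.net/page/{page}", timeout=10)
        response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
        htmls.append(response.text)
        time.sleep(1)  # stay polite: pause between requests
    return htmls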



# Parse the desired fields from one page of HTML
def parse_html(html):
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.find_all("article")
    datas = []
    for article in articles:
        # Grab the <a> tag inside the post title
        title_node = article.select_one("h2.entry-title a")
        # Equivalent lookup using find():
        # title_node = (
        #     article.find("h2", class_="entry-title").
        #     find("a")
        # )
        if title_node is None:
            continue  # skip articles without a linked title
        # Post title
        title = title_node.get_text()
        # Post URL
        link = title_node["href"]
        datas.append({
            "title": title,
            "link": link
        })
    return datas
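
# Quick sanity check for parse_html against a hand-written snippet that
# mirrors the blog's markup (the snippet is illustrative, not real data):
#
#   sample = (
#       '<article><h2 class="entry-title">'
#       '<a href="http://example.com/post-1">Hello</a></h2></article>'
#   )
#   parse_html(sample)
#   # -> [{'title': 'Hello', 'link': 'http://example.com/post-1'}]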

if __name__ == "__main__":
    htmls = download_pages()
    all_datas = []
    for html in htmls:
        all_datas.extend(parse_html(html))
    with open("data.json", "w", encoding="utf-8") as f:
        # With indent set, json.dump already uses (",", ": ") as separators,
        # so the explicit separators argument is redundant and dropped here.
        json.dump(all_datas, f, ensure_ascii=False, indent=4)
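
    # With indent=4 and ensure_ascii=False, data.json comes out as readable
    # UTF-8 along these lines (titles/links here are placeholders):
    # [
    #     {
    #         "title": "...",
    #         "link": "http://www.crazyant.net/..."
    #     }
    # ]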
